######     TRANSPOSABLE ELEMENTS     ######

#read in tables of each class of transposable elements, remove alternative haplotypes, remove pseudo elements by minimum length and make GRanges objects.

#SVA elements: the first six columns are renamed so that makeGRangesFromDataFrame can pick up
#Chr/Start/End/Strand; Length is the absolute difference between the Start and End columns.
#NOTE(review): no chromosome or minimum-length filter is applied to this table here -
#presumably the input file is already filtered; confirm.
SVA_merged <- read.table("UCSCGenomeBrowser_hg38_SVA_bigger1000bp_merged5prime_SimpleRepeats_20bpGap.bed", sep="\t", stringsAsFactors=FALSE)
names(SVA_merged)[1:6] <- c("Chr","Start","End", "Name", "Score", "Strand")
SVA_merged$Length <- abs(SVA_merged$End - SVA_merged$Start)
Grange_SVA_merged <- makeGRangesFromDataFrame(SVA_merged, keep.extra.columns=TRUE)

#one GRanges object per SVA subfamily, selected by the Name column (%in% treats a missing Name as no match).
Grange_SVA_A <- Grange_SVA_merged[Grange_SVA_merged$Name %in% "SVA_A"]
Grange_SVA_B <- Grange_SVA_merged[Grange_SVA_merged$Name %in% "SVA_B"]
Grange_SVA_C <- Grange_SVA_merged[Grange_SVA_merged$Name %in% "SVA_C"]
Grange_SVA_D <- Grange_SVA_merged[Grange_SVA_merged$Name %in% "SVA_D"]
Grange_SVA_E <- Grange_SVA_merged[Grange_SVA_merged$Name %in% "SVA_E"]
Grange_SVA_F <- Grange_SVA_merged[Grange_SVA_merged$Name %in% "SVA_F"]

#chromosomes to keep: chr1-chr22, chrX and chrY. Rows on any other sequence name are dropped.
a <- c(1:22,"X","Y")
b <- paste0("chr",a)

#read one tab-separated transposable element table and return it as a data frame.
#  path        : file to read.
#  chromosomes : character vector of chromosome names to keep.
#The first six columns are renamed to Chr/Start/End/Name/Score/Strand, a Length column
#(absolute difference between the second and third column) is appended, and only rows
#whose Chr is listed in `chromosomes` are returned.
read_te_table <- function(path, chromosomes) {
  te <- read.table(path, stringsAsFactors=FALSE, sep="\t")
  te$Length <- abs(te$V2 - te$V3)
  colnames(te)[1:6] <- c("Chr","Start","End","Name","Score","Strand")
  te[which(te$Chr %in% chromosomes),]
}

#turn a table produced by read_te_table into a GRanges object (extra columns are kept as
#metadata) and keep only the elements whose Length is strictly greater than `min_length`.
te_granges <- function(te, min_length) {
  gr <- makeGRangesFromDataFrame(te, keep.extra.columns=TRUE)
  gr[which(gr$Length > min_length),]
}

#Alu families
AluYa5 <- read_te_table("UCSCGenomeBrowser_hg38_AluYa5.txt", b)
Grange_AluYa5 <- te_granges(AluYa5, min_length=300)

AluYb8 <- read_te_table("UCSCGenomeBrowser_hg38_AluYb8.txt", b)
Grange_AluYb8 <- te_granges(AluYb8, min_length=300)

AluYb9 <- read_te_table("UCSCGenomeBrowser_hg38_AluYb9.txt", b)
Grange_AluYb9 <- te_granges(AluYb9, min_length=300)

#AluYd8 uses a lower threshold (250) than the other Alu families (300).
AluYd8 <- read_te_table("UCSCGenomeBrowser_hg38_AluYd8.txt", b)
Grange_AluYd8 <- te_granges(AluYd8, min_length=250)

AluYe5 <- read_te_table("UCSCGenomeBrowser_hg38_AluYe5.txt", b)
Grange_AluYe5 <- te_granges(AluYe5, min_length=300)

AluYg6 <- read_te_table("UCSCGenomeBrowser_hg38_AluYg6.txt", b)
Grange_AluYg6 <- te_granges(AluYg6, min_length=300)

AluYi6 <- read_te_table("UCSCGenomeBrowser_hg38_AluYi6.txt", b)
Grange_AluYi6 <- te_granges(AluYi6, min_length=300)

AluYk4 <- read_te_table("UCSCGenomeBrowser_hg38_AluYk4.txt", b)
Grange_AluYk4 <- te_granges(AluYk4, min_length=300)

#HERVK internal sequences (note the hyphen in the file name versus the underscore in the variable name).
HERVK_int <- read_te_table("UCSCGenomeBrowser_hg38_HERVK-int.txt", b)
Grange_HERVK_int <- te_granges(HERVK_int, min_length=1600)

#L1 families
L1HS <- read_te_table("UCSCGenomeBrowser_hg38_L1HS.txt", b)
Grange_L1HS <- te_granges(L1HS, min_length=6000)

L1PA2 <- read_te_table("UCSCGenomeBrowser_hg38_L1PA2.txt", b)
Grange_L1PA2 <- te_granges(L1PA2, min_length=6000)

#LTR families
LTR12C <- read_te_table("UCSCGenomeBrowser_hg38_LTR12C.txt", b)
Grange_LTR12C <- te_granges(LTR12C, min_length=1000)

LTR12E <- read_te_table("UCSCGenomeBrowser_hg38_LTR12E.txt", b)
Grange_LTR12E <- te_granges(LTR12E, min_length=1000)

LTR22B1 <- read_te_table("UCSCGenomeBrowser_hg38_LTR22B1.txt", b)
Grange_LTR22B1 <- te_granges(LTR22B1, min_length=450)

LTR22C2 <- read_te_table("UCSCGenomeBrowser_hg38_LTR22C2.txt", b)
Grange_LTR22C2 <- te_granges(LTR22C2, min_length=450)

LTR5_Hs <- read_te_table("UCSCGenomeBrowser_hg38_LTR5_Hs.txt", b)
Grange_LTR5_Hs <- te_granges(LTR5_Hs, min_length=900)

LTR7 <- read_te_table("UCSCGenomeBrowser_hg38_LTR7.txt", b)
Grange_LTR7 <- te_granges(LTR7, min_length=350)

#MER52 families
MER52A <- read_te_table("UCSCGenomeBrowser_hg38_MER52A.txt", b)
Grange_MER52A <- te_granges(MER52A, min_length=1400)

MER52C <- read_te_table("UCSCGenomeBrowser_hg38_MER52C.txt", b)
Grange_MER52C <- te_granges(MER52C, min_length=1000)

MER52D <- read_te_table("UCSCGenomeBrowser_hg38_MER52D.txt", b)
Grange_MER52D <- te_granges(MER52D, min_length=1000)

#concatenate the SVA set and every length-filtered family into a single GRanges object.
#the order of the list is the order of the ranges in Grange_TEs.
Grange_TEs <- do.call(c, list(
  Grange_SVA_merged,
  Grange_AluYa5,
  Grange_AluYb8,
  Grange_AluYb9,
  Grange_AluYd8,
  Grange_AluYe5,
  Grange_AluYg6,
  Grange_AluYi6,
  Grange_AluYk4,
  Grange_HERVK_int,
  Grange_L1HS,
  Grange_L1PA2,
  Grange_LTR12C,
  Grange_LTR12E,
  Grange_LTR22B1,
  Grange_LTR22C2,
  Grange_LTR5_Hs,
  Grange_LTR7,
  Grange_MER52A,
  Grange_MER52C,
  Grange_MER52D
))


######     STRUCTURAL VARIANTS IN TRANSPOSABLE ELEMENTS     ######

#load the structural variant table from Audano et al. 2019, drop every row whose chromosome
#is not listed in b (this removes alternative haplotypes) and build a GRanges object.
#Length is the absolute difference between the Start and End columns.
SVs <- read.table("Audano2019_SVs.txt", sep="\t", stringsAsFactors=FALSE)
names(SVs)[1:6] <- c("Chr","Start","End", "Name", "Variant", "Type")
SVs$Length <- abs(SVs$End - SVs$Start)
SVs <- SVs[SVs$Chr %in% b,]
Grange_SVs <- makeGRangesFromDataFrame(SVs, keep.extra.columns=TRUE)

#find the structural variants that lie entirely within a transposable element (type="within")
#and turn the hits into a data frame: column 1 indexes Grange_SVs, column 2 indexes Grange_TEs.
SVs_within_TEs <- findOverlaps(Grange_SVs, Grange_TEs, type="within")
SVs_within_TEs <- as.data.frame(SVs_within_TEs)

#remove the structural variants that are insertional polymorphisms (length of structural variant is higher than 95% of length of transposable element)
#NOTE(review): the size compared here is the "Variant" column of the SV table, not the
#computed Length column - confirm that "Variant" holds the variant length.
SV_Variant <- Grange_SVs$Variant[SVs_within_TEs[,1]]
TE_Length <- Grange_TEs$Length[SVs_within_TEs[,2]]
#use a logical mask instead of -which(...): when no pair exceeds the threshold, which()
#returns integer(0) and x[-integer(0), ] selects ZERO rows, silently discarding every hit.
#pairs whose comparison is NA are kept, exactly as which() would have left them in place.
exceeds_95 <- SV_Variant > TE_Length*0.95
is_insertional <- !is.na(exceeds_95) & exceeds_95
SVs_within_TEs_2 <- SVs_within_TEs[!is_insertional,]
Grange_TEs_within_95 <- Grange_TEs[SVs_within_TEs_2[,2]]
Grange_SVs_within_95 <- Grange_SVs[SVs_within_TEs_2[,1]]

#for each class of transposable elements keep only the elements that overlap at least one
#of the retained structural variants (Grange_SVs_within_95).
keep_TEs_with_SVs <- function(te_ranges) {
  subsetByOverlaps(te_ranges, Grange_SVs_within_95)
}

#SVA: all elements, then each subfamily
SVs_SVA <- keep_TEs_with_SVs(Grange_SVA_merged)
SVs_SVAA <- keep_TEs_with_SVs(Grange_SVA_A)
SVs_SVAB <- keep_TEs_with_SVs(Grange_SVA_B)
SVs_SVAC <- keep_TEs_with_SVs(Grange_SVA_C)
SVs_SVAD <- keep_TEs_with_SVs(Grange_SVA_D)
SVs_SVAE <- keep_TEs_with_SVs(Grange_SVA_E)
SVs_SVAF <- keep_TEs_with_SVs(Grange_SVA_F)

#Alu
SVs_AluYa5 <- keep_TEs_with_SVs(Grange_AluYa5)
SVs_AluYb8 <- keep_TEs_with_SVs(Grange_AluYb8)
SVs_AluYb9 <- keep_TEs_with_SVs(Grange_AluYb9)
SVs_AluYd8 <- keep_TEs_with_SVs(Grange_AluYd8)
SVs_AluYe5 <- keep_TEs_with_SVs(Grange_AluYe5)
SVs_AluYg6 <- keep_TEs_with_SVs(Grange_AluYg6)
SVs_AluYi6 <- keep_TEs_with_SVs(Grange_AluYi6)
SVs_AluYk4 <- keep_TEs_with_SVs(Grange_AluYk4)

#HERVK
SVs_HERVK_int <- keep_TEs_with_SVs(Grange_HERVK_int)

#L1
SVs_L1HS <- keep_TEs_with_SVs(Grange_L1HS)
SVs_L1PA2 <- keep_TEs_with_SVs(Grange_L1PA2)

#LTR
SVs_LTR12C <- keep_TEs_with_SVs(Grange_LTR12C)
SVs_LTR12E <- keep_TEs_with_SVs(Grange_LTR12E)
SVs_LTR22B1 <- keep_TEs_with_SVs(Grange_LTR22B1)
SVs_LTR22C2 <- keep_TEs_with_SVs(Grange_LTR22C2)
SVs_LTR5_Hs <- keep_TEs_with_SVs(Grange_LTR5_Hs)
SVs_LTR7 <- keep_TEs_with_SVs(Grange_LTR7)

#MER52
SVs_MER52A <- keep_TEs_with_SVs(Grange_MER52A)
SVs_MER52C <- keep_TEs_with_SVs(Grange_MER52C)
SVs_MER52D <- keep_TEs_with_SVs(Grange_MER52D)
